#### Prepare observations dataset with coffee outcomes and weather

## Prepare environment

setwd("~/research/coffee-dataverse/src")

library(dplyr)
library(reshape2)
library(ggplot2)

## Collect totals
df.header <- read.csv("../data/coffee-produced-total.csv", nrows=1)
years <- names(df.header)[-(1:3)]
df.data <- read.csv("../data/coffee-produced-total.csv", skip=1)
names(df.data)[4:ncol(df.data)] <- years
df.data2 <- melt(df.data, id.vars=c('N�vel', 'C�d.', 'Munic�pio'))
df.data2$year <- sapply(df.data2$variable, function(var) as.numeric(substring(var, 2, 5)))
names(df.data2)[1:4] <- c('Level', 'Code', 'Munic�pio', 'ignore')

df.total <- df.data2[, -4]
df.total$value <- as.numeric(df.total$value)

## Collect by variety
df.header <- read.csv("../data/coffee-produced.csv", nrows=1)
crops <- t(df.header[1, 3:4]) # note that shifted from col names
years <- names(df.header)[3 + seq(1, 91, by=2)]
df.data <- read.csv("../data/coffee-produced.csv", skip=1)
names(df.data)[4:ncol(df.data)] <- paste0(rep(crops, 6), "XX", rep(years, each=length(crops)))
df.data2 <- melt(df.data, id.vars=c('N�vel', 'C�d.', 'Munic�pio'))
df.data2$variable <- as.character(df.data2$variable)
df.data2$crop <- sapply(df.data2$variable, function(var) strsplit(var, "XXX")[[1]][1])
df.data2$year <- sapply(df.data2$variable, function(var) as.numeric(strsplit(var, "XXX")[[1]][2]))
names(df.data2)[1:4] <- c('Level', 'Code', 'Munic�pio', 'ignore')

df.byvar <- df.data2[, -4]
df.byvar$value <- as.numeric(df.byvar$value)

df <- df.total %>% left_join(subset(df.byvar, crop == "Caf� (em gr�o) Ar�bica"), by=c('Level', 'Code', 'Munic�pio', 'year'), suffix=c('.total', '.arabica')) %>% left_join(subset(df.byvar, crop == "Caf� (em gr�o) Canephora"), by=c('Level', 'Code', 'Munic�pio', 'year'), suffix=c('', '.robusta'))
df <- df[, c(1:3, 5, 4, 6, 8)]
names(df)[5:7] <- c('total', 'arabica', 'robusta')
df <- df[!is.na(df$total) | !is.na(df$arabica) | !is.na(df$robusta),]

## Account for change in how recorded before 2001
df$total[df$year <= 2001] <- df$total[df$year <= 2001] * .5

## Collect harvesting
df.header <- read.csv("../data/coffee-harvested-total.csv", nrows=1)
years <- names(df.header)[-(1:3)]
df.data <- read.csv("../data/coffee-harvested-total.csv", skip=1)
names(df.data)[4:ncol(df.data)] <- years
df.data2 <- melt(df.data, id.vars=c('N�vel', 'C�d.', 'Munic�pio'))
df.data2$year <- sapply(df.data2$variable, function(var) as.numeric(substring(var, 2, 5)))
names(df.data2)[1:4] <- c('Level', 'Code', 'Munic�pio', 'ignore')

df.harvest <- df.data2[, -4]
df.harvest$value <- as.numeric(df.harvest$value)

## Collect harvesting by variety
df.brazil2 <- subset(read.csv("../data/brazil-coffee.csv"), metric == 'harvested')
df.brazil3 <- data.frame(region=df.brazil2$region, code=df.brazil2$code, crop=df.brazil2$crop, year=df.brazil2$year, value=as.numeric(df.brazil2$value))

df.harvest2 <- df.harvest %>% left_join(subset(df.brazil3, crop == 'caf� (em gr�o) ar�bica'), by=c('Code'='code', 'Munic�pio'='region', 'year'), suffix=c('.total', '.arabica')) %>% left_join(subset(df.brazil3, crop == 'caf� (em gr�o) canephora'), by=c('Code'='code', 'Munic�pio'='region', 'year'), suffix=c('', '.robusta'))

df.harvest2 <- df.harvest2[, c(1:3, 5, 4, 7, 9)]
names(df.harvest2)[5:7] <- c('total', 'arabica', 'robusta')
df.harvest2 <- df.harvest2[!is.na(df.harvest2$total) | !is.na(df.harvest2$arabica) | !is.na(df.harvest2$robusta),]

df2 <- df %>% left_join(df.harvest2, c('Level', 'Code', 'Munic�pio', 'year'), suffix=c('.produce', '.harvest'))

## File contains outcome data
write.csv(df2, "../data/observations-coffee.csv", row.names=F)

### Combine with weather data
## Reset the environment

setwd("~/research/coffee-dataverse/src")

do.allyears <- T

library(ggplot2)
library(dplyr)

## Load the data

df.weather <- read.csv("../data/weather-brazil-sachs2.csv")
df.prod <- read.csv("../data/observations-coffee.csv", fileEncoding='utf-8')
maxpoints <- read.csv("../data/brazil-maxpoints.csv")

if (do.allyears) {
    df.prod2 <- cbind(Level="MU", Code=NA, expand.grid(Munic�pio=unique(df.prod$Munic�pio), year=min(df.prod$year):max(df.prod$year))) %>% left_join(df.prod, by=c('Munic�pio', 'year'), suffix=c('', '.drop'))
    df.prod <- df.prod2[, -grep("drop", names(df.prod2))]
}

df <- df.prod %>% left_join(maxpoints, by=c('Munic�pio'='region'))
df <- df[, -which(names(df) %in% c('allowdist', 'shpname'))]

## Remove extra digits, so R == python
df$longitude <- round(df$longitude, 6)
df$latitude <- round(df$latitude, 6)
df.weather$lon <- round(df.weather$lon, 6)
df.weather$lat <- round(df.weather$lat, 6)
df.weather <- df.weather[!duplicated(df.weather[, c('year', 'lon', 'lat')]),]

df2 <- df %>% left_join(df.weather, by=c('year', 'longitude'='lon', 'latitude'='lat'))
df2$year <- df2$year - 1
df3 <- df2 %>% left_join(df.weather, by=c('year', 'longitude'='lon', 'latitude'='lat'), suffix=c('', '-1'))
df3$year <- df3$year + 1

## Write out final observations file
if (do.allyears) {
    write.csv(df3, "../data/observations-full-all.csv", row.names=F)
} else {
    write.csv(df3, "../data/observations-full.csv", row.names=F)
}
